2.1. Incomplete dataset
load("/Volumes/LGBT Project data/Multiple Imputation/d_2014_incomplete.RData")
summary( d_2014_incomplete )
sampling_strata_region calibrated_weight no.of.population sexual_identity_2014 sexual_identity_fluidity
Lidingö : 1061 Min. : 8.497 Min. : 7463 Heterosexual :18104 0 :12228
Nacka : 1058 1st Qu.: 40.581 1st Qu.: 29879 Homosexual : 245 1 : 864
Botkyrka: 790 Median : 66.588 Median : 44852 Bisexual : 397 NA's: 9158
Täby : 651 Mean : 79.311 Mean : 46291 None of the above: 1503
Danderyd: 623 3rd Qu.:103.465 3rd Qu.: 62422 NA's : 2001
Bromma : 610 Max. :450.081 Max. :108458
(Other) :17457
age sex country_of_birth education income marital_status
Min. :16.00 Male : 9867 Sweden :18178 <=9 years : 3237 Min. : 0 Never married : 7327
1st Qu.:37.00 Female:12383 Europe : 2282 10-12 years: 8036 1st Qu.: 2191 Currently married:10872
Median :51.00 Outside Europe: 1790 >=13 years :10818 Median : 3093 Other : 4051
Mean :51.18 NA's : 159 Mean : 3800
3rd Qu.:66.00 3rd Qu.: 4259
Max. :99.00 Max. :204536
"print" NA's :42
living_alone personal_support weight_strata
yes : 4272 yes :18030 15 : 911
no :16318 no : 2405 7 : 910
NA's: 1660 NA's: 1815 19 : 904
4 : 902
22 : 901
9 : 899
(Other):16823
sapply( d_2014_incomplete, class ) # all continuous variables are numeric, and all categorical variables are factor
sampling_strata_region calibrated_weight no.of.population sexual_identity_2014
"factor" "numeric" "numeric" "factor"
sexual_identity_fluidity age sex country_of_birth
"factor" "numeric" "factor" "factor"
education income marital_status living_alone
"factor" "numeric" "factor" "factor"
personal_support weight_strata
"factor" "factor"
miss_var_summary( d_2014_incomplete ) # 41.2% missing in sexual_identity_fluidity, 9.0% in sexual_identity_2014, 8.2% in personal_support, 7.5% in living_alone, 0.7% in education, and 0.2% in income
2.2. Two-level multivariate normal imputation
# specify imputation model
# fml_imp_2014 <- sexual_identity_fluidity + sexual_identity_2014 + personal_support + living_alone + education + income ~ 1 + age*sex + country_of_birth + marital_status + ( 1 | weight_strata )
# final imputation with the chosen number of iterations
# imp_final_2014 <- jomoImpute( data = d_2014_incomplete,
# formula = fml_imp_2014,
# random.L1 = "full",
# n.burn = 5000,
# n.iter = 2000,
# m = 60,
# seed = 12345
# ) # took around 23 hours
load("/Volumes/LGBT Project data/Multiple Imputation/imp_final_2014.RData")
summary( imp_final_2014 )
Call:
jomoImpute(data = d_2014_incomplete, formula = fml_imp_2014,
random.L1 = "full", n.burn = 5000, n.iter = 2000, m = 60,
seed = 12345)
Cluster variable: weight_strata
Target variables: sexual_identity_fluidity sexual_identity_2014 personal_support living_alone education income
Fixed effect predictors: (Intercept) age sex country_of_birth marital_status age:sex
Random effect predictors: (Intercept)
Performed 5000 burn-in iterations, and generated 60 imputed data sets,
each 2000 iterations apart.
Potential scale reduction (Rhat, imputation phase):
Min 25% Mean Median 75% Max
Beta: 1.000 1.002 1.028 1.007 1.019 1.334
Psi: 1.000 1.000 1.008 1.003 1.010 1.057
Sigma: 1.000 1.000 1.839 1.133 1.695 24.201
Largest potential scale reduction:
Beta: [2,2], Psi: [8,2], Sigma: [52,2]
Missing data per variable:
weight_strata sexual_identity_fluidity sexual_identity_2014 personal_support living_alone education income
MD% 0 41.2 9.0 8.2 7.5 0.7 0.2
sampling_strata_region calibrated_weight no.of.population age sex country_of_birth marital_status
MD% 0 0 0 0 0 0 0
plot( imp_final_2014, trace = "all", print = "beta" )

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

2.3. Validate imputed data
# extract imputed datasets
original_data_2014 <- mitmlComplete( imp_final_2014, print = 0 ) # extract original incomplete dataset
implist_2014 <- mitmlComplete( imp_final_2014, print = "all" ) # extract all imputed datasets
original_data_2014$imputation <- "0"
all_data_2014 <- bind_rows( original_data_2014,
bind_rows( implist_2014, .id = "imputation" ) ) # merge datasets
all_data_2014$imputation <- as.numeric( all_data_2014$imputation )
summary( all_data_2014 )
weight_strata sexual_identity_fluidity sexual_identity_2014 personal_support living_alone
15 : 55571 0 :1234374 Heterosexual :1205793 yes :1192975 yes : 282997
7 : 55510 1 : 113718 Homosexual : 18107 no : 162460 no :1072593
19 : 55144 NA's: 9158 Bisexual : 28311 NA's: 1815 NA's: 1660
4 : 55022 None of the above: 103038
22 : 54961 NA's : 2001
9 : 54839
(Other):1026203
education income sampling_strata_region calibrated_weight no.of.population age
<=9 years :199571 Min. :-49053 Lidingö : 64721 Min. : 8.497 Min. : 7463 Min. :16.00
10-12 years:493886 1st Qu.: 2190 Nacka : 64538 1st Qu.: 40.581 1st Qu.: 29879 1st Qu.:37.00
>=13 years :663634 Median : 3093 Botkyrka: 48190 Median : 66.588 Median : 44852 Median :51.00
NA's : 159 Mean : 3799 Täby : 39711 Mean : 79.311 Mean : 46291 Mean :51.18
3rd Qu.: 4263 Danderyd: 38003 3rd Qu.:103.469 3rd Qu.: 62422 3rd Qu.:66.00
Max. :204536 Bromma : 37210 Max. :450.081 Max. :108458 Max. :99.00
NA's :42 (Other) :1064877
sex country_of_birth marital_status imputation
Male :601887 Sweden :1108858 Never married :446947 Min. : 0
Female:755363 Europe : 139202 Currently married:663192 1st Qu.:15
Outside Europe: 109190 Other :247111 Median :30
Mean :30
3rd Qu.:45
Max. :60
# sexual identity in 2014
ggplot( all_data_2014[ !is.na( all_data_2014$sexual_identity_2014 ), ],
aes( fill = sexual_identity_2014, x = imputation ) ) +
geom_bar( position = "fill" ) +
scale_y_continuous( labels = scales::percent ) +
scale_fill_discrete( name = "Sexual identity in 2014" ) +
labs(
x = "Imputation number",
y = "Proportion",
caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
theme_classic() +
theme( axis.title.x = element_text( family = "Arial", size = 11 ),
axis.text.x = element_text( family = "Arial", size = 11 ),
axis.text.y = element_text( family = "Arial", size = 11 ),
axis.title.y = element_text( family = "Arial", size = 11 ),
legend.text = element_text( family = "Arial", size = 10 ),
legend.title = element_text( family = "Arial", size = 10 ),
legend.position = "bottom",
plot.caption = element_text( family = "Arial", size = 10, hjust = 0 )
)

# change in sexual identity during 2014-2021
ggplot( all_data_2014[ !is.na( all_data_2014$sexual_identity_fluidity ), ],
aes( fill = sexual_identity_fluidity, x = imputation ) ) +
geom_bar( position = "fill" ) +
scale_y_continuous( labels = scales::percent ) +
scale_fill_discrete( name = "Change in sexual identity during 2014-2021", labels = c( "No", "Yes" ) ) +
labs(
x = "Imputation number",
y = "Proportion",
caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
theme_classic() +
theme( axis.title.x = element_text( family = "Arial", size = 11 ),
axis.text.x = element_text( family = "Arial", size = 11 ),
axis.text.y = element_text( family = "Arial", size = 11 ),
axis.title.y = element_text( family = "Arial", size = 11 ),
legend.text = element_text( family = "Arial", size = 10 ),
legend.title = element_text( family = "Arial", size = 10 ),
legend.position = "bottom",
plot.caption = element_text( family = "Arial", size = 10, hjust = 0 )
)

# education
ggplot( all_data_2014[ !is.na( all_data_2014$education ), ],
aes( fill = education, x = imputation ) ) +
geom_bar( position = "fill" ) +
scale_y_continuous( labels = scales::percent ) +
scale_fill_discrete( name = "Level of education" ) +
labs(
x = "Imputation number",
y = "Proportion",
caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
theme_classic() +
theme( axis.title.x = element_text( family = "Arial", size = 11 ),
axis.text.x = element_text( family = "Arial", size = 11 ),
axis.text.y = element_text( family = "Arial", size = 11 ),
axis.title.y = element_text( family = "Arial", size = 11 ),
legend.text = element_text( family = "Arial", size = 10 ),
legend.title = element_text( family = "Arial", size = 10 ),
legend.position = "bottom",
plot.caption = element_text( family = "Arial", size = 10, hjust = 0 )
)

# income
summary( all_data_2014$income )
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-49053 2190 3093 3799 4263 204536 42
nrow( all_data_2014[ all_data_2014$income < 0 & !is.na( all_data_2014$income ), ] ) # 564 imputed values are negative
[1] 564
# living status
ggplot( all_data_2014[ !is.na( all_data_2014$living_alone ), ],
aes( fill = living_alone, x = imputation ) ) +
geom_bar( position = "fill" ) +
scale_y_continuous( labels = scales::percent ) +
scale_fill_discrete( name = "Living alone", labels = c( "Yes", "No" ) ) +
labs(
x = "Imputation number",
y = "Proportion",
caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
theme_classic() +
theme( axis.title.x = element_text( family = "Arial", size = 11 ),
axis.text.x = element_text( family = "Arial", size = 11 ),
axis.text.y = element_text( family = "Arial", size = 11 ),
axis.title.y = element_text( family = "Arial", size = 11 ),
legend.text = element_text( family = "Arial", size = 10 ),
legend.title = element_text( family = "Arial", size = 10 ),
legend.position = "bottom",
plot.caption = element_text( family = "Arial", size = 10, hjust = 0 )
)

# personal support
ggplot( all_data_2014[ !is.na( all_data_2014$personal_support ), ],
aes( fill = personal_support, x = imputation ) ) +
geom_bar( position = "fill" ) +
scale_y_continuous( labels = scales::percent ) +
scale_fill_discrete( name = "Personal support", labels = c( "Yes", "No" ) ) +
labs(
x = "Imputation number",
y = "Proportion",
caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
theme_classic() +
theme( axis.title.x = element_text( family = "Arial", size = 11 ),
axis.text.x = element_text( family = "Arial", size = 11 ),
axis.text.y = element_text( family = "Arial", size = 11 ),
axis.title.y = element_text( family = "Arial", size = 11 ),
legend.text = element_text( family = "Arial", size = 10 ),
legend.title = element_text( family = "Arial", size = 10 ),
legend.position = "bottom",
plot.caption = element_text( family = "Arial", size = 10, hjust = 0 )
)
